# install.packages("readr")
# install.packages("knitr")
# install.packages("tidyverse")
# install.packages("plotly")
# install.packages("patchwork")
# install.packages("rmarkdown")
# install.packages("broom")
# install.packages("ggpubr")
# install.packages("kableExtra")
library(kableExtra)
library(readr)
library(knitr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.1.0
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(patchwork)
library(rmarkdown)
library(broom)
library(ggpubr)
library(kableExtra)
opts_chunk$set(echo=TRUE, message=FALSE, warning=FALSE)
message("All packages loaded.")
## All packages loaded.
pretty_r2_table <- function(df, caption_text){
df %>%
kable("html", caption = caption_text) %>%
kable_styling(full_width = F) %>%
row_spec(0, bold = TRUE) %>%
column_spec(3, color = ifelse(df$p.value < 0.05, "red", "black"))
}
pretty_cor_table <- function(x, y, var1_name = NULL, var2_name = NULL) {
cor_res <- cor.test(x, y)
tibble(
Variable1 = var1_name,
Variable2 = var2_name,
Correlation = round(cor_res$estimate, 3),
p.value = signif(cor_res$p.value, 3)) %>%
kable("html", caption = paste0("Correlation: ", var1_name, " vs ", var2_name)) %>%
kable_styling(full_width = F) %>%
row_spec(0, bold = TRUE) %>%
column_spec(4, color = ifelse(cor_res$p.value < 0.05, "red", "black"))
}
#set up data for use
data <- read_csv("EdStats_v01.csv")
#clean up data to have USA data
data_clean <- data %>%
filter(grepl("USA", `Country code`)) %>%
select(-`2024`) #all na
#make sure no NA cols remain
colnames(data_clean)[colSums(is.na(data_clean)) == nrow(data_clean)]
## character(0)
write_csv(data_clean, "EdStats_USA.csv")
#filter for enrollment/attendance info
data <- read_csv("EdStats_USA.csv")
data_filter_attend <- data %>%
filter(grepl("total net enrolment rate|total net attendance rate", `Indicator name`, ignore.case = TRUE)) %>%
filter(!grepl("female|male", `Indicator name`)) %>%
filter(!grepl("adjusted gender parity index", `Indicator name`, ignore.case = TRUE)) %>%
select(where(~!all(is.na(.)))) %>% view() %>%
write_csv("EdStats_attend.csv")
#filter for num teachers and teaching staff compensation info
data <- read_csv("EdStats_USA.csv")
data_filter_teacher <- data %>%
filter(grepl("teachers in|teaching staff compensation", `Indicator name`, ignore.case = TRUE)) %>%
filter(!grepl("female|male", `Indicator name`)) %>%
filter(!grepl("percentage of qualified", `Indicator name`, ignore.case = TRUE)) %>%
select(where(~!all(is.na(.)))) %>% view() %>%
write_csv("EdStats_teacher.csv")
#add scatter plot? correlation matrix?
data_attend <- read_csv("EdStats_attend.csv")
#select years with most observations
data_attend_years <- data_attend %>%
select(!c(`1975`, `1986`, `1987`, `1990`, `1991`, `1993`, `1995`, `1996`, `1999`, `1994`))
#reshape data for visualization
data_long <- data_attend_years %>%
pivot_longer(cols=`2005`:`2022`, names_to="Year", values_to="Value") %>%
mutate(
Year=as.numeric(Year),
Type=case_when(
grepl("attendance", `Indicator name`, ignore.case=TRUE) ~ "Attendance",
grepl("enrolment", `Indicator name`, ignore.case=TRUE) ~ "Enrollment",
TRUE ~ "Other"
),
Level=case_when(
grepl("primary", `Indicator name`, ignore.case=TRUE) ~ "Elementary School",
grepl("lower secondary", `Indicator name`, ignore.case=TRUE) ~ "Middle School",
grepl("upper secondary", `Indicator name`, ignore.case=TRUE) ~ "High School",
TRUE ~ "Other"
)
) %>%
filter(Type!="Other", !is.na(Value))
#comparison plots
#all data line graphs
p_line_per_school_horizontal <- ggplot(data_long, aes(x=Year, y=Value, color=Type)) +
geom_line(size=1.2, na.rm=TRUE) +
geom_point(size=2, alpha=0.8) +
facet_grid(~Level, scales="fixed", switch="y") +
theme_minimal() +
theme(
strip.background=element_rect(fill="grey90", color=NA),
panel.spacing=unit(1, "lines")
) +
labs(
title="Attendance vs. Enrollment Trends in Schools Over Time (USA)",
y="Rate (%)",
color="Variable"
) +
scale_color_manual(values=c("Attendance"="#FF69B4", "Enrollment"="#3bbf8f")
)
ggsave("line_per_school_horizontal.png", p_line_per_school_horizontal)
p_line_interactive1 <- ggplotly(p_line_per_school_horizontal)
p_line_interactive1
#enrollment vs attendance
data_wide <- data_long %>%
pivot_wider(
names_from=Type,
values_from=Value
)
data_combined <- data_wide %>%
group_by(Level, Year) %>%
summarise(
Attendance = Attendance[!is.na(Attendance)],
Enrollment = Enrollment[!is.na(Enrollment)]
) %>%
ungroup()
###
model <- lm(Enrollment ~ Attendance, data=data_combined)
# summary(model) #not significant
by_year_r2 <- data_combined %>%
group_by(Year) %>%
do(glance(lm(Enrollment ~ Attendance, data=.))) %>%
select(Year,r.squared,p.value)
# by_year_r2 #not significant
pretty_r2_table(by_year_r2, "R² and p-values by Year (Red = significant, Black = not significant)")
R² and p-values by Year (Red = significant, Black = not significant)
|
Year
|
r.squared
|
p.value
|
|
2015
|
0.9997303
|
0.0104555
|
|
2016
|
0.9659338
|
0.1181787
|
|
2017
|
0.9330262
|
0.1666495
|
|
2020
|
0.3520319
|
0.5956316
|
|
2021
|
0.0093777
|
0.9382539
|
by_schooling_r2 <- data_combined %>%
group_by(Level) %>%
do(glance(lm(Enrollment ~ Attendance, data=.))) %>%
select(Level,r.squared,p.value)
# by_schooling_r2 #not significant
pretty_r2_table(by_schooling_r2, "R² and p-values by School Level (Red = significant, Black = not significant)")
R² and p-values by School Level (Red = significant, Black = not
significant)
|
Level
|
r.squared
|
p.value
|
|
Elementary School
|
0.1355655
|
0.5420218
|
|
High School
|
0.2899115
|
0.3491842
|
|
Middle School
|
0.0677148
|
0.6724550
|
####
eqn_text <- paste0(
"Trendline: y = ",
round(coef(model)[2],3),"x + ",
round(coef(model)[1],3),
"<br>R² = ",round(summary(model)$r.squared,3),
"<br>p = ",signif(summary(model)$coefficients[2,4],3)
)
enroll_attend_scatter <- ggplot(data_combined, aes(x=Attendance, y=Enrollment)) +
geom_point(aes(color=factor(Year)), size=2) +
geom_smooth(method=lm, aes(group=1, text=eqn_text)) +
theme_minimal() +
labs(title="All Attendance vs Enrollment Observations",
x="Attendance",
y="Enrollment",
color="Year")
enroll_attend_scatter_plotly <- ggplotly(enroll_attend_scatter, tooltip=c("text","x","y","color"))
enroll_attend_scatter_plotly
####
enroll_longitudinal <- ggplot(data_combined, aes(x=Year, y=Enrollment, color=Level)) +
geom_point(size=2) +
geom_smooth(method=lm, size=1, alpha=0.25) +
theme_minimal() +
labs(title=" Enrollment Observations Longitudinally",
x="Year",
y="Entrollemnt",
color="Level")
enroll_longitudinal_plotly <- ggplotly(enroll_longitudinal)
enroll_longitudinal_plotly
pretty_cor_table(data_combined$Year, data_combined$Enrollment, "Year", "Enrollment")
Correlation: Year vs Enrollment
|
Variable1
|
Variable2
|
Correlation
|
p.value
|
|
Year
|
Enrollment
|
0.099
|
0.725
|
###
attend_longitudinal <- ggplot(data_combined, aes(x=Year, y=Attendance, color=Level)) +
geom_point(size=2) +
geom_smooth(method=lm, size=1, alpha=0.25) +
theme_minimal() +
labs(title=" Attendance Observations Longitudinally",
x="Year",
y="Attendance",
color="Level")
attend_longitudinal_plotly <- ggplotly(attend_longitudinal)
attend_longitudinal_plotly
pretty_cor_table(data_combined$Year, data_combined$Attendance, "Year", "Attendance")
Correlation: Year vs Attendance
|
Variable1
|
Variable2
|
Correlation
|
p.value
|
|
Year
|
Attendance
|
-0.594
|
0.0196
|
###
#supplemental plots about the data itself
#maybe shrink them
p1 <- ggplot(data_long, aes(x=Type, fill=Type)) +
geom_bar() +
theme_minimal() +
labs(title="Count of Attendance vs Enrollment Observations",
x="Variable",
y="Count")
# p1
p1_plotly <- ggplotly(p1)
p1_plotly
ggsave("Observation_counts_variable_attend.png", p1)
p2<-ggplot(data_long, aes(x=factor(Year))) +
geom_bar(fill="#c562f0") +
theme_minimal() +
labs(title="Number of Observations Per Year (All Variables)",
x="Year",
y="Count")
# p2
p2_plotly <- ggplotly(p2)
p2_plotly
ggsave("Observation_counts_year_attend.png", p2)
p3<-ggplot(data_long, aes(x=Level, fill=Type)) +
geom_bar(position="dodge") +
theme_minimal() +
labs(title="Number of Observations Per School Level",
x="School Level",
y="Count",
fill="Type")
# p3
p3_plotly <- ggplotly(p3)
p3_plotly
ggsave("Observation_counts_school_attend.png", p3)
# combine_supp_plots <- p1+p2+p3
# combine_supp_plots
#tendency measures
summary_stats <- data_long %>%
group_by(Type, Level) %>%
summarise(
n = n(),
Mean = mean(Value, na.rm=TRUE),
Median = median(Value, na.rm=TRUE),
SD = sd(Value, na.rm=TRUE),
Variance = var(Value, na.rm=TRUE),
Minimum = min(Value, na.rm=TRUE),
Maximum = max(Value, na.rm=TRUE),
Range = Maximum - Minimum,
Q1 = quantile(Value, 0.25, na.rm=TRUE),
Q3 = quantile(Value, 0.75, na.rm=TRUE),
IQR = Q3 - Q1,
) %>%
arrange(Type, Level)
paged_table(summary_stats)
write_csv(summary_stats, "tendency_attend.csv")
data_teacher <- read_csv("EdStats_teacher.csv")
#select data with most observations and remove data not using yet
data_teacher <- data_teacher %>%
filter(!grepl("non-teaching staff", `Indicator name`, ignore.case = TRUE)) %>%
filter(!grepl("secondary public institutions (%)", `Indicator name`, ignore.case = TRUE)) %>% #no teacher count
select(!c(`1975`, `1986`, `1987`, `1990`, `1991`, `1993`, `1995`, `1996`, `1971`, `1972`, `1973`, `1974`, `1976`, `1977`, `1984`, `1985`, `1998`, `1992`, `1994`, `1998`, `2022`, `1988`))
#Set up data for visualization
data_teachers_long <- data_teacher %>%
pivot_longer(cols=`2014`:`2021`, names_to="Year", values_to="Value") %>%
mutate(
Year = as.numeric(Year),
Type = case_when(
grepl("number", `Indicator name`, ignore.case=TRUE) ~ "Number of Teachers",
grepl("compensation", `Indicator name`, ignore.case=TRUE) ~ "Compensation % of Total Expenditure",
TRUE ~ "Other"
),
Level = case_when(
grepl("pre-primary", `Indicator name`, ignore.case=TRUE) ~ "Pre-School",
grepl("primary", `Indicator name`, ignore.case=TRUE) ~ "Elementary School",
grepl("lower secondary", `Indicator name`, ignore.case=TRUE) ~ "Middle School",
grepl("upper secondary", `Indicator name`, ignore.case=TRUE) ~ "High School",
grepl("post-secondary", `Indicator name`, ignore.case=TRUE) ~ "Undergraduate School", #kinda estimating what post-secondary non-tertiary means
grepl("tertiary", `Indicator name`, ignore.case=TRUE) ~ "Post-Grad Schooling",
TRUE ~ "Other"
)
) %>%
filter(Type != "Other", Level != "Other", !is.na(Value))
data_teachers_long <- data_teachers_long %>%
mutate(Value_10k = ifelse(Type == "Number of Teachers", Value / 10000, Value))
data_teachers_long <- data_teachers_long %>%
mutate(Level = factor(Level, levels = c(
"Pre-School",
"Elementary School",
"Middle School",
"High School",
"Undergraduate School",
"Post-Grad Schooling"
)))
teachers <- data_teachers_long %>%
filter(Type == "Number of Teachers") %>%
select(Level, Year, Teachers_10k = Value_10k)
compensation <- data_teachers_long %>%
filter(Type == "Compensation % of Total Expenditure") %>%
select(Level, Year, Compensation_Percent = Value)
comp_teacher <- left_join(teachers, compensation, by = c("Level", "Year"))
###
model_teacher <- lm(Teachers_10k ~ Compensation_Percent, data=comp_teacher)
# summary(model_teacher)
by_year_r2_teacher <- comp_teacher %>%
group_by(Year) %>%
filter(!is.na(Teachers_10k) & !is.na(Compensation_Percent)) %>%
do(glance(lm(Teachers_10k ~ Compensation_Percent, data=.))) %>%
select(Year, r.squared, p.value)
by_year_r2_teacher_clean <- by_year_r2_teacher %>%
filter(!is.na(r.squared) & !is.na(p.value)) #had nas
pretty_r2_table(by_year_r2_teacher_clean, "R² and p-values by Year (Red = significant, Black = not significant)")
R² and p-values by Year (Red = significant, Black = not significant)
|
Year
|
r.squared
|
p.value
|
|
2015
|
0.2757049
|
0.3635882
|
|
2017
|
0.2428672
|
0.3989430
|
by_schooling_r2_teacher <- comp_teacher %>%
group_by(Level) %>%
filter(!is.na(Teachers_10k) & !is.na(Compensation_Percent)) %>%
do(glance(lm(Teachers_10k ~ Compensation_Percent, data=.))) %>%
select(Level, r.squared, p.value)
by_schooling_r2_teacher_clean <- by_schooling_r2_teacher %>%
filter(!is.na(r.squared) & !is.na(p.value)) #had nas
pretty_r2_table(by_schooling_r2_teacher_clean,
"R² and p-values by School Level (Red = significant, Black = not significant)")
R² and p-values by School Level (Red = significant, Black = not
significant)
|
Level
|
r.squared
|
p.value
|
|
Pre-School
|
0.3259720
|
0.2366458
|
|
Elementary School
|
0.3032885
|
0.2574385
|
|
Middle School
|
0.6895414
|
0.0407134
|
|
High School
|
0.7435832
|
0.0271316
|
####
#comparison plots
eqn_text_teacher <- paste0(
"Trendline: y = ",
round(coef(model_teacher)[2],3), "x + ",
round(coef(model_teacher)[1],3),
"<br>R² = ", round(summary(model_teacher)$r.squared,3),
"<br>p = ", signif(summary(model_teacher)$coefficients[2,4],3)
)
teacher_comp_scatter <- ggplot(comp_teacher, aes(x=Compensation_Percent, y=Teachers_10k)) +
geom_point(aes(color=factor(Year)), size=2) +
geom_smooth(method=lm, aes(group=1, text=eqn_text_teacher)) +
theme_minimal() +
labs(title="Number of Teachers vs Compensation (All School Levels)",
x="Compensation (% of Total Expenditure)",
y="Number of Teachers (x10k)",
color="Year")
teacher_comp_scatter_plotly <- ggplotly(teacher_comp_scatter, tooltip=c("text","x","y","color"))
teacher_comp_scatter_plotly
###
p_line_teacher <- ggplot(data_teachers_long, aes(x=Year, y=Value_10k, color=Type)) +
geom_line(size=1.2, na.rm=TRUE) +
geom_point(size=2, alpha=0.8) +
facet_wrap(~Level, nrow=3, scales="fixed") +
theme_minimal() +
theme(
strip.background=element_rect(fill="grey90", color=NA)
) +
labs(title="Teachers & Compensation Trends Over Time (USA)",
y="Variable",
color="Variable") +
scale_color_manual(
values = c("Number of Teachers" = "#FF69B4",
"Compensation % of Total Expenditure" = "#3bbf8f"),
labels = c("Number of Teachers\nIn 10,000s", "Teacher Compensation % of\nTotal Instituation Expenditure")
) +
scale_x_continuous(breaks = seq(2014, 2021, by = 2))
ggsave("line_teacherl.png", p_line_teacher)
p_teacher_interactive <- ggplotly(p_line_teacher)
p_teacher_interactive
###
teachers_longitudinal <- ggplot(comp_teacher, aes(x=Year, y=Teachers_10k, color=Level)) +
geom_point(size=2) +
geom_smooth(method=lm, size=1, alpha=0.25) +
theme_minimal() +
labs(title="Number of Teachers Longitudinally by School Level",
x="Year",
y="Number of Teachers (x10k)",
color="Level")
teachers_longitudinal_plotly <- ggplotly(teachers_longitudinal)
teachers_longitudinal_plotly
pretty_cor_table(comp_teacher$Year, comp_teacher$Teachers_10k, "Year", "Number of Teachers")
Correlation: Year vs Number of Teachers
|
Variable1
|
Variable2
|
Correlation
|
p.value
|
|
Year
|
Number of Teachers
|
-0.089
|
0.611
|
##
###I dont know wtf is happening here. Figure out later #####################################
# comp_teacher_clean <- comp_teacher %>%
# filter(!is.na(Compensation_Percent)) %>%
# filter(Level != "Post-Grad Schooling")
#
# compensation_longitudinal <- ggplot(comp_teacher_clean, aes(x=Year, y=Compensation_Percent, color=Level)) +
# geom_point(size=2) +
# geom_smooth(method=lm, size=1, alpha=0.25) +
# theme_minimal() +
# labs(title="Teacher Compensation % Longitudinally by School Level",
# x="Year",
# y="Number of Teachers (x10k)",
# color="Level")
#
# compensation_longitudinal_plotly <- ggplotly(compensation_longitudinal)
# compensation_longitudinal_plotly
#
# pretty_cor_table(comp_teacher$Year, comp_teacher_clean$Compensation_Percent, "Year", "Compensation (%)")
############################################################################################
###
#supplemental plots
# count per Type
p1 <-ggplot(data_teachers_long, aes(x=Type, fill = Type)) +
geom_bar() +
theme_minimal() +
labs(title="Count of Observations per Type", x="Variable", y="Count")
p1_plotly <- ggplotly(p1)
p1_plotly
ggsave("Observation_counts_variable_teacher.png", p1)
# count per Year
p2 <- ggplot(data_teachers_long, aes(x=factor(Year))) +
geom_bar(fill="#c562f0") +
theme_minimal() +
labs(title="Number of Observations per Year", x="Year", y="Count")
p2_plotly <- ggplotly(p2)
p2_plotly
ggsave("Observation_counts_year_teacher.png", p2)
# count per Level
p3<-ggplot(data_teachers_long, aes(x=Level, fill=Type)) +
geom_bar(position="dodge") +
theme_minimal() +
labs(title="Number of Observations per School Level", x="Level", y="Count", fill="Variable")
p3_plotly <- ggplotly(p3)
p3_plotly
ggsave("Observation_counts_school_teacher.png", p3)
#tendency
summary_stats_teachers <- data_teachers_long %>%
group_by(Type, Level) %>%
summarise(
n = n(),
Mean = mean(Value, na.rm=TRUE),
Median = median(Value, na.rm=TRUE),
SD = sd(Value, na.rm=TRUE),
Variance = var(Value, na.rm=TRUE),
Minimum = min(Value, na.rm=TRUE),
Maximum = max(Value, na.rm=TRUE),
Range = Maximum - Minimum,
Q1 = quantile(Value, 0.25, na.rm=TRUE),
Q3 = quantile(Value, 0.75, na.rm=TRUE),
IQR = Q3 - Q1,
) %>%
arrange(Type, Level)
paged_table(summary_stats_teachers)
write_csv(summary_stats_teachers, "tendency_teachers.csv")